#include <stdio.h>
#include <cuda_runtime.h>
#include <iostream>
#include <math.h>

// Number of threads per block used for the kernel launch; also the element
// capacity of the shared-memory staging buffer declared in heapSortKernel.
const int ntpb = 512;

// Exchanges the two float values that a and b point to.
// Passing the same address for both leaves the value unchanged.
__device__ void swap(float *a, float *b) {
	float &first = *a;
	float &second = *b;

	const float saved = second;
	second = first;
	first = saved;
}

// Sifts the value at index i down through the array-embedded binary tree
// arr[0..size): while one of the node's children (at 2*node+1 and 2*node+2)
// holds a strictly greater value, the node is swapped with the greater child
// and the walk continues from that child's position.
//
// arr  - array holding the heap
// size - number of leading elements of arr that belong to the heap
// i    - index of the node to sift down
__device__ void heapify(float* arr, int size, int i) {
	int node = i;

	for (;;) {
		const int left = 2 * node + 1;
		const int right = left + 1;

		// Pick the greatest of the node and its in-range children.
		int top = node;
		if (left < size && arr[left] > arr[top]) {
			top = left;
		}
		if (right < size && arr[right] > arr[top]) {
			top = right;
		}

		// Neither child is greater: the walk is finished.
		if (top == node) {
			return;
		}

		swap(&arr[top], &arr[node]);
		node = top;
	}
}


// Turns arr[0..size) into a max-heap by sifting down every node from index
// (size - 2) / 2 back to the root, in that order.
__device__ void buildHeap(float *arr, int size) {
	int node = (size - 2) / 2;

	while (node >= 0) {
		heapify(arr, size, node);
		--node;
	}
}

// Sorts arr[0..size) into ascending order with heap sort.
//
// Launch layout: the sort is performed by thread 0 of block 0. Any additional
// blocks return immediately so that they cannot race on arr; the remaining
// threads of block 0 only help copy data to and from shared memory. Any 1D
// block size works because the copy loops stride by blockDim.x.
//
// Shared memory: ntpb floats (static). When size <= ntpb the data is staged
// in shared memory, sorted there and copied back. When size > ntpb it does
// not fit in the staging buffer, so thread 0 sorts directly in arr instead.
//
// arr  - device pointer to at least size floats; sorted in place
// size - number of elements to sort; values <= 0 make the kernel a no-op
__global__ void heapSortKernel(float *arr, int size) {
	__shared__ float s[ntpb];

	// Both conditions are uniform across a block, so returning here cannot
	// leave part of a block waiting at a barrier below.
	if (blockIdx.x != 0 || size <= 0) {
		return;
	}

	const bool staged = size <= ntpb;
	float *heap = staged ? s : arr;

	if (staged) {
		// Copy only the valid elements; arr holds no more than size floats.
		for (int i = threadIdx.x; i < size; i += blockDim.x) {
			s[i] = arr[i];
		}
	}
	// Make the staged data visible to thread 0 before it starts sorting.
	__syncthreads();

	if (threadIdx.x == 0) {
		buildHeap(heap, size);

		// Repeatedly move the current maximum to the end of the shrinking
		// heap and restore the heap property for what remains.
		int heapSize = size;
		while (heapSize > 1) {
			swap(heap, &heap[heapSize - 1]);
			--heapSize;
			heapify(heap, heapSize, 0);
		}
	}
	// The other threads must not copy results out until the sort is finished.
	__syncthreads();

	if (staged) {
		for (int i = threadIdx.x; i < size; i += blockDim.x) {
			arr[i] = s[i];
		}
	}
}

// Fills arr[0..size) with pseudo-random integers: each element receives the
// next rand() value reduced modulo size.
void fillArray(int* arr, int size) {
	int *slot = arr;
	for (int remaining = size; remaining > 0; --remaining) {
		*slot++ = rand() % size;
	}
}

// Writes the first size elements of arr to std::cout, each followed by a
// single space, then ends the line (std::endl also flushes the stream).
void print(int *arr, int size) {
	const int *cursor = arr;
	for (int remaining = size; remaining > 0; --remaining) {
		std::cout << *cursor << " ";
		++cursor;
	}
	std::cout << std::endl;
}

// Reports a failed CUDA runtime call on std::cerr.
// Returns true when status is cudaSuccess and false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
	if (status == cudaSuccess) {
		return true;
	}
	std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
	return false;
}

// Usage: <program> <number of elements>
//
// Fills a host array with random integers, prints it, sorts it on the device
// with heapSortKernel and prints the sorted result.
// Returns 0 on success and 1 on a usage error or a failed CUDA call.
int main(int argc, char *argv[]) {
	if (argc < 2) {
		std::cerr << "Usage: " << (argc > 0 ? argv[0] : "heapsort")
		          << " <number of elements>" << std::endl;
		return 1;
	}

	// atoi yields 0 for non-numeric input, which is rejected here as well.
	const int numOfElements = atoi(argv[1]);
	if (numOfElements <= 0) {
		std::cerr << "The number of elements must be a positive integer" << std::endl;
		return 1;
	}

	// Size in bytes of the float buffers exchanged with the device
	const size_t size = static_cast<size_t>(numOfElements) * sizeof(float);

	// Host data as integers (the form fillArray and print work with) ...
	int *heapArray = new int[numOfElements];
	// ... and as floats, which is the element type the kernel sorts.
	float *h_A = new float[numOfElements];

	fillArray(heapArray, numOfElements);

	std::cout << "Host Side:   ";
	print(heapArray, numOfElements);

	// Convert by value rather than copying raw int bytes into float storage.
	// Values are below numOfElements, so they are exact in a float as long
	// as numOfElements does not exceed 2^24.
	for (int i = 0; i < numOfElements; i++) {
		h_A[i] = static_cast<float>(heapArray[i]);
	}

	// Allocate the device buffer and upload the input
	float *d_A = NULL;
	bool ok = cudaSucceeded(cudaMalloc((void **)&d_A, size), "cudaMalloc");
	if (ok) {
		ok = cudaSucceeded(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
		                   "cudaMemcpy (host to device)");
	}

	if (ok) {
		// The kernel sorts the whole array from a single block, so exactly
		// one block is launched.
		heapSortKernel<<<1, ntpb>>>(d_A, numOfElements);
		// Launch-configuration errors are only visible through cudaGetLastError.
		ok = cudaSucceeded(cudaGetLastError(), "heapSortKernel launch");
	}
	if (ok) {
		// Errors raised while the kernel runs surface at the next sync point.
		ok = cudaSucceeded(cudaDeviceSynchronize(), "heapSortKernel execution");
	}
	if (ok) {
		ok = cudaSucceeded(cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost),
		                   "cudaMemcpy (device to host)");
	}

	if (ok) {
		for (int i = 0; i < numOfElements; i++) {
			heapArray[i] = static_cast<int>(h_A[i]);
		}
		std::cout << "Device Side: ";
		print(heapArray, numOfElements);
	}

	// Free device global memory (a null pointer is accepted by cudaFree)
	cudaFree(d_A);
	// Free host memory
	delete [] h_A;
	delete [] heapArray;

	cudaDeviceReset();

	return ok ? 0 : 1;
}